#! /usr/bin/env perl

use strict;
use warnings;
use Getopt::Long;

my ($help);
my $min_gap_len = 100;
my $max_gap     = 2;

GetOptions("m|min-gap=i" => \$min_gap_len,
           "M|max-gap=f" => \$max_gap,
           "h|help"      => \$help) or die;

my $usage = <<EOS;
$0 unitig_len super_read.fa mer_len

In each PacBio read, for each gap, try to fill in the gap with super
reads using only k-unitigs overlap.

 -m,--min-gap-len LEN    Minimum gap len. Any smaller gap is assumed to be of size LEN ($min_gap_len)
 -M,--max-gap FLOAT      How many times larger than gap size do we search
 -h,--help               This message
EOS
    ;

if(defined($help)) {
  print($usage);
  exit(0);
}

if(@ARGV < 3) {
  print(STDERR $usage);
  exit(1);
}

my $unitig_len_file = shift @ARGV;
my $super_read_file = shift @ARGV;
my $mer_len         = shift @ARGV;

my @unitig_len;
my @unitig_sr;

# Read unitig lengths
open(my $ul_io, "<", $unitig_len_file) or
    die "Can't open file '$unitig_len_file': $!";
while(<$ul_io>) {
  my ($index, $len) = split;
  push(@unitig_len, $len);
}
close($ul_io);

# Read super read fasta file
open(my $sr_io, "<", $super_read_file) or
    die "Can't open file '$super_read_file': $!";
while(<$sr_io>) {
  next unless /^>(\S+)/;
  my $sr_name = $1;
  my @unitigs = split(/_/, $sr_name);
  foreach my $unitig (@unitigs) {
    push(@{$unitig_sr[substr($unitig, 0, -1)]}, $sr_name);
  }
}
close($sr_io);

# Reverse a super read name
sub reverse_sr {
  my $r = join("_", reverse(split(/_/, $_[0])));
  $r =~ y/FR/RF/;
  $r
}

# Given a super read name, returns the name of the first unitig
sub first_unitig {
  $_[0] =~ /^([^_]+)(?:_|$)/ and return $1;
  return undef;
}

# Given a super read name, returns the name of the last unitig
sub last_unitig {
  $_[0] =~ /(?:^|_)([^_]+)$/ and return $1;
  return undef;
}

# Given two super read name sr1 sr2, it returns the unitigs in the
# overlap between sr1 and sr2, or undef if there is no overlap. The
# last unitig of sr1 can be passed as the last argument.
sub unitig_overlap {
  my ($sr1, $sr2, $last_unitig1) = @_;
  if(!defined($last_unitig1)) {
    $last_unitig1 = last_unitig($sr1);
    return undef unless defined($last_unitig1);
  }
  return undef unless $sr2 =~ /(?:^|_)($last_unitig1)(?:_|$)/;
  my $common_len     = $+[1];
  my $common_unitigs = substr($sr1, -$common_len);
  return undef unless $common_unitigs eq substr($sr2, 0, $common_len);
  return $common_unitigs;
}

sub length_sr {
  my ($sr)    = @_;
  my @unitigs = split(/_/, $sr);
  my $len     = 0;
  $len += $unitig_len[substr($_, 0, -1)] foreach (@unitigs);
  $len - (@unitigs - 1) * ($mer_len - 1);
}

sub print_path {
  my ($pbr, $sr_path, $len) = @_;
  print("$pbr $len");
  print(" $_") foreach (@$sr_path);
  print("\n");
}

sub get_distinct_candidates {
  my ($start_sr, $end_sr, $candidates) = @_;
  my %unique_path;

  foreach my $cpath (@$candidates) {
    my ($sr_path, $len) = @$cpath;
    my $path;
    my $nb_unitigs;
    if(@$sr_path) {
      my $prev_sr = shift(@$sr_path);
      my $start_overlap = unitig_overlap($start_sr, $prev_sr);
      $path = substr($prev_sr, length($start_overlap) + 1);
      foreach my $sr (@$sr_path) {
        my $overlap = unitig_overlap($prev_sr, $sr);
        $path       = $path . substr($sr, length($overlap));
        $prev_sr    = $sr;
      }
      my $end_overlap = unitig_overlap($prev_sr, $end_sr);
      $path = substr($path, 0, -(length($end_overlap) + 1));
      my @unitigs = split(/_/, $path);
      $nb_unitigs = scalar(@unitigs);
    } else {
      $path = "";
      my @unitigs = split(/_/, substr($end_sr, 0, -$len));
      $nb_unitigs = scalar(@unitigs);
    }
    $unique_path{$path} = [$len, $nb_unitigs];
  }
  return %unique_path;
}

sub fill_gap {
  my ($pbr, $start_sr, $end_sr, $gap_len) = @_;
  my %seen;
  my @candidates; # Possible paths found

# Queue contains pairs of (super_read, length of path)
  my @queue;
  push(@queue, [[$start_sr], 1]);

  while(@queue) {
    return if(@queue > 10);
    my ($sr_path, $len) = @{shift(@queue)};
    my $sr              = $$sr_path[-1];
    my @unitigs         = split(/_/, $sr);


    my $common_unitigs = unitig_overlap($sr, $end_sr, $unitigs[-1]);
    if($common_unitigs) { # Found and overlap with target. Print and done.
      $len -= length_sr($common_unitigs) - ($mer_len - 1);
      if($max_gap * $gap_len >= $len) {
        my @found_path = @$sr_path;
#        push(@found_path, $end_sr);
        shift(@found_path); # remove first super read. It is always $start_sr
        push(@candidates, [\@found_path, $len]);
      }
      next;
    }
    next if $len > $max_gap * $gap_len; # Length already too much, give up

    # Find all the possible non-containing overlaps with other super reads (or their
    # reverse complement), and append to the end of the queue
    for(my $i = 1; $i < @unitigs; $i++) {
      my $unitig_id = substr($unitigs[$i], 0, -1);
      foreach my $nsr (@{$unitig_sr[$unitig_id]}) {
        next if $seen{$nsr};
        $seen{$nsr} = 1;
        $common_unitigs = unitig_overlap($sr, $nsr, $unitigs[-1]);
        unless($common_unitigs) {
          $nsr            = reverse_sr($nsr);
          $common_unitigs = unitig_overlap($sr, reverse_sr($nsr), $unitigs[-1]);
        }
        next unless $common_unitigs;
        next if length($common_unitigs) == length($nsr);

        my $add_unitigs = substr($nsr, length($common_unitigs) + 1);
        my $nlen        = $len - ($mer_len - 1) + length_sr($add_unitigs);
        push(@queue, [[@$sr_path, $nsr], $nlen]);
      }
    }
  }

  return @candidates;
}

my $pb_name;
my @prev_line;
while(<>) {
  if(/^>(.+)$/) {
    $pb_name   = $1;
    @prev_line = ();
    next;
  }
  # 0, 1 => position of match
  # 2, 3 => pac bio read name and sequence of mega read
  # 4    => mega read name
  my @line = split;
  if(@prev_line) {
    my ($start_gap, $end_gap, $pbr, $start_sr, $end_sr) =
        ($prev_line[1], $line[0], $pb_name, $prev_line[4], $line[4]);
    if($start_sr ne $end_sr) {
      my $gap_len = $end_gap - $start_gap + 1;
      $gap_len = $min_gap_len if $gap_len < $min_gap_len;
      my @candidates = fill_gap($pbr, $start_sr, $end_sr, $gap_len);
      my %distinct_candidates = get_distinct_candidates($start_sr, $end_sr, \@candidates);
      if(keys %distinct_candidates == 1) {
        while(my ($path, $lens) = each %distinct_candidates) {
          print("+ $pbr $start_gap $end_gap $$lens[0] $$lens[1] $path\n");
        }
      } else {
        print("- $pbr $start_gap $end_gap ", scalar(keys(%distinct_candidates)), "\n");
      }
    }
#    print("-> ", scalar(keys %distinct_candidates), "\n");
    # print_candidates($pbr, $start_sr, $end_sr, \@candidates);
  }
  @prev_line = @line;
}
